From 67105aaa6009604553f60c13d7ab3baabc1ae663 Mon Sep 17 00:00:00 2001 From: Haidong Lan Date: Thu, 14 Jul 2022 22:39:59 +0800 Subject: [PATCH 01/11] Implement shared memory for CUDA --- python/taichi/lang/ast/ast_transformer.py | 1 - python/taichi/lang/expr.py | 1 + python/taichi/lang/impl.py | 8 ++++++++ python/taichi/lang/simt/block.py | 14 +++++++++++++- taichi/codegen/llvm/codegen_llvm.cpp | 22 +++++++++++++++++++++- taichi/ir/frontend_ir.cpp | 10 ++++++++++ taichi/ir/frontend_ir.h | 11 ++++++++--- taichi/ir/statements.h | 12 ++++++++---- taichi/python/export_lang.cpp | 1 + taichi/transforms/ir_printer.cpp | 8 +++++--- taichi/transforms/lower_ast.cpp | 2 +- 11 files changed, 76 insertions(+), 14 deletions(-) diff --git a/python/taichi/lang/ast/ast_transformer.py b/python/taichi/lang/ast/ast_transformer.py index e565045cb109c..76015ac9efd94 100644 --- a/python/taichi/lang/ast/ast_transformer.py +++ b/python/taichi/lang/ast/ast_transformer.py @@ -97,7 +97,6 @@ def build_Assign(ctx, node): # Ref https://github.com/taichi-dev/taichi/issues/2659. values = node.value.ptr if is_static_assign else impl.expr_init( node.value.ptr) - for node_target in node.targets: ASTTransformer.build_assign_unpack(ctx, node_target, values, is_static_assign) diff --git a/python/taichi/lang/expr.py b/python/taichi/lang/expr.py index 2f60e86aeb284..d7814b798b96c 100644 --- a/python/taichi/lang/expr.py +++ b/python/taichi/lang/expr.py @@ -7,6 +7,7 @@ from taichi.types.primitive_types import integer_types, real_types + # Scalar, basic data type class Expr(TaichiOperations): """A Python-side Expr wrapper, whose member variable `ptr` is an instance of C++ Expr class. 
A C++ Expr object contains member variable `expr` which holds an instance of C++ Expression class.""" diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index 43940fce55439..10a671c49359e 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -19,6 +19,7 @@ MeshRelationAccessProxy, MeshReorderedMatrixFieldProxy, MeshReorderedScalarFieldProxy, element_type_name) +from taichi.lang.simt.block import SharedArray from taichi.lang.snode import SNode from taichi.lang.struct import Struct, StructField, _IntermediateStruct from taichi.lang.util import (cook_dtype, get_traceback, is_taichi_class, @@ -26,11 +27,16 @@ from taichi.types.primitive_types import all_types, f16, f32, f64, i32, i64 + @taichi_scope def expr_init_local_tensor(shape, element_type, elements): return get_runtime().prog.current_ast_builder().expr_alloca_local_tensor( shape, element_type, elements) +@taichi_scope +def expr_init_scratch_pad(shape, element_type): + return get_runtime().prog.current_ast_builder().expr_alloca_scratch_pad( + shape, element_type) @taichi_scope def expr_init(rhs): @@ -40,6 +46,8 @@ def expr_init(rhs): return type(rhs)(*rhs.to_list()) if isinstance(rhs, Matrix): return Matrix(rhs.to_list()) + if isinstance(rhs, SharedArray): + return rhs if isinstance(rhs, Struct): return Struct(rhs.to_dict(include_methods=True)) if isinstance(rhs, list): diff --git a/python/taichi/lang/simt/block.py b/python/taichi/lang/simt/block.py index 427664473dd5d..390376c895d64 100644 --- a/python/taichi/lang/simt/block.py +++ b/python/taichi/lang/simt/block.py @@ -1,5 +1,17 @@ from taichi.lang import impl - +from taichi.lang.util import taichi_scope def sync(): return impl.call_internal("block_barrier", with_runtime_context=False) + + +class SharedArray: + _is_taichi_class = True + def __init__(self, shape, dtype): + self.shape = shape + self.dtype = dtype + self.shared_array_proxy = impl.expr_init_scratch_pad(shape, dtype) + + @taichi_scope + def _subscript(self, 
indices, get_ref=False): + return impl.make_index_expr(self.shared_array_proxy, (indices,)) \ No newline at end of file diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp index 6966c81f44d84..d33e563987c3e 100644 --- a/taichi/codegen/llvm/codegen_llvm.cpp +++ b/taichi/codegen/llvm/codegen_llvm.cpp @@ -128,7 +128,27 @@ void CodeGenLLVM::visit(AllocaStmt *stmt) { auto type = tlctx->get_data_type(tensor_type->get_element_type()); auto array_size = tlctx->get_constant(tensor_type->get_num_elements()); // Return type is [array_size x type]*. - llvm_val[stmt] = create_entry_block_alloca(type, 0, array_size); + if (stmt->is_shared) { + size_t data_element_size = tlctx->get_type_size(tlctx->get_data_type(tensor_type->get_element_type())); + auto type = llvm::ArrayType::get(llvm::Type::getInt8Ty(*llvm_context), + data_element_size * tensor_type->get_num_elements()); + auto base = new llvm::GlobalVariable( + *module, type, false, llvm::GlobalValue::ExternalLinkage, nullptr, + fmt::format("shared_array_{}", stmt->id), nullptr, llvm::GlobalVariable::NotThreadLocal, + 3 /*addrspace=shared*/); + base->setAlignment(llvm::MaybeAlign(8)); + + auto ptr = builder->CreateGEP( +#ifdef TI_LLVM_15 + base->getValueType(), +#endif + base, {tlctx->get_constant(0), tlctx->get_constant(0)}); + auto ptr_type = llvm::PointerType::get( + tlctx->get_data_type(tensor_type->get_element_type()), 0); + llvm_val[stmt] = builder->CreatePointerCast(ptr, ptr_type); + } else { + llvm_val[stmt] = create_entry_block_alloca(type, 0, array_size); + } } else { TI_ASSERT(stmt->width() == 1); llvm_val[stmt] = diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index f2c03ea18c99d..15d52fff10b30 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -985,6 +985,16 @@ Expr ASTBuilder::expr_alloca_local_tensor(const std::vector &shape, return var; } +Expr ASTBuilder::expr_alloca_scratch_pad(const std::vector &shape, + const DataType 
&element_type) { + auto var = Expr(std::make_shared(get_next_id())); + this->insert(std::make_unique( + std::static_pointer_cast(var.expr)->id, shape, + element_type, true)); + var->ret_type = this->get_last_stmt()->ret_type; + return var; +} + void ASTBuilder::expr_assign(const Expr &lhs, const Expr &rhs, std::string tb) { TI_ASSERT(lhs->is_lvalue()); auto stmt = std::make_unique(lhs, rhs); diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index fdbe86c95c620..bb4fbba305408 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -63,17 +63,20 @@ class FrontendAllocaStmt : public Stmt { public: Identifier ident; - FrontendAllocaStmt(const Identifier &lhs, DataType type) : ident(lhs) { + FrontendAllocaStmt(const Identifier &lhs, DataType type) : ident(lhs), is_shared(false) { ret_type = TypeFactory::create_vector_or_scalar_type(1, type); } FrontendAllocaStmt(const Identifier &lhs, std::vector shape, - DataType element) - : ident(lhs) { + DataType element, + bool is_shared=false) + : ident(lhs), is_shared(is_shared) { ret_type = DataType(TypeFactory::create_tensor_type(shape, element)); } + bool is_shared; + TI_DEFINE_ACCEPT }; @@ -874,6 +877,8 @@ class ASTBuilder { Expr expr_alloca_local_tensor(const std::vector &shape, const DataType &element_type, const ExprGroup &elements); + Expr expr_alloca_scratch_pad(const std::vector &shape, + const DataType &element_type); void expr_assign(const Expr &lhs, const Expr &rhs, std::string tb); void create_assert_stmt(const Expr &cond, const std::string &msg, diff --git a/taichi/ir/statements.h b/taichi/ir/statements.h index 0ffa885f1fb27..f0409c6f6700c 100644 --- a/taichi/ir/statements.h +++ b/taichi/ir/statements.h @@ -18,17 +18,20 @@ class Function; */ class AllocaStmt : public Stmt { public: - AllocaStmt(DataType type) { + AllocaStmt(DataType type) : is_shared(false) { ret_type = TypeFactory::create_vector_or_scalar_type(1, type); TI_STMT_REG_FIELDS; } - AllocaStmt(int width, DataType type) { + 
AllocaStmt(int width, DataType type) : is_shared(false) { ret_type = TypeFactory::create_vector_or_scalar_type(width, type); TI_STMT_REG_FIELDS; } - AllocaStmt(const std::vector &shape, DataType type) { + AllocaStmt(const std::vector &shape, + DataType type, + bool is_shared = false) + : is_shared(is_shared) { ret_type = TypeFactory::create_tensor_type(shape, type); TI_STMT_REG_FIELDS; } @@ -41,7 +44,8 @@ class AllocaStmt : public Stmt { return false; } - TI_STMT_DEF_FIELDS(ret_type); + bool is_shared; + TI_STMT_DEF_FIELDS(ret_type, is_shared); TI_DEFINE_ACCEPT_AND_CLONE }; diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index a2ea36d1ecfc2..609ebd36c931e 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -303,6 +303,7 @@ void export_lang(py::module &m) { .def("insert_external_func_call", &ASTBuilder::insert_external_func_call) .def("expr_alloca", &ASTBuilder::expr_alloca) .def("expr_alloca_local_tensor", &ASTBuilder::expr_alloca_local_tensor) + .def("expr_alloca_scratch_pad", &ASTBuilder::expr_alloca_scratch_pad) .def("create_assert_stmt", &ASTBuilder::create_assert_stmt) .def("expr_assign", &ASTBuilder::expr_assign) .def("begin_frontend_range_for", &ASTBuilder::begin_frontend_range_for) diff --git a/taichi/transforms/ir_printer.cpp b/taichi/transforms/ir_printer.cpp index eb94695e83125..76eb3f28153b9 100644 --- a/taichi/transforms/ir_printer.cpp +++ b/taichi/transforms/ir_printer.cpp @@ -119,8 +119,9 @@ class IRPrinter : public IRVisitor { } void visit(FrontendAllocaStmt *alloca) override { - print("{}${} = alloca {}", alloca->type_hint(), alloca->id, - alloca->ident.name()); + std::string shared_suffix = (alloca->is_shared) ? 
" shared" : ""; + print("{}${} = alloca {}{}", alloca->type_hint(), alloca->id, + alloca->ident.name(), shared_suffix); } void visit(FrontendAssertStmt *assert) override { @@ -187,7 +188,8 @@ class IRPrinter : public IRVisitor { } void visit(AllocaStmt *alloca) override { - print("{}${} = alloca", alloca->type_hint(), alloca->id); + std::string shared_suffix = (alloca->is_shared) ? " shared" : ""; + print("{}${} = alloca{}", alloca->type_hint(), alloca->id, shared_suffix); } void visit(RandStmt *stmt) override { diff --git a/taichi/transforms/lower_ast.cpp b/taichi/transforms/lower_ast.cpp index e7c390ce3c41e..fd339bae354e4 100644 --- a/taichi/transforms/lower_ast.cpp +++ b/taichi/transforms/lower_ast.cpp @@ -71,7 +71,7 @@ class LowerAST : public IRVisitor { if (stmt->ret_type->is()) { auto tensor_type = stmt->ret_type->cast(); auto lowered = std::make_unique( - tensor_type->get_shape(), tensor_type->get_element_type()); + tensor_type->get_shape(), tensor_type->get_element_type(), stmt->is_shared); block->local_var_to_stmt.insert(std::make_pair(ident, lowered.get())); stmt->parent->replace_with(stmt, std::move(lowered)); } else { From 88bcdaf7b635aedbd26e76b8325545d8f3324ae3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 14 Jul 2022 16:18:29 +0000 Subject: [PATCH 02/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- python/taichi/lang/expr.py | 1 - python/taichi/lang/impl.py | 3 ++- python/taichi/lang/simt/block.py | 4 +++- taichi/codegen/llvm/codegen_llvm.cpp | 14 ++++++++------ taichi/ir/frontend_ir.cpp | 6 +++--- taichi/ir/frontend_ir.h | 7 ++++--- taichi/ir/statements.h | 2 +- taichi/transforms/lower_ast.cpp | 3 ++- 8 files changed, 23 insertions(+), 17 deletions(-) diff --git a/python/taichi/lang/expr.py b/python/taichi/lang/expr.py index d7814b798b96c..2f60e86aeb284 100644 --- a/python/taichi/lang/expr.py +++ 
b/python/taichi/lang/expr.py @@ -7,7 +7,6 @@ from taichi.types.primitive_types import integer_types, real_types - # Scalar, basic data type class Expr(TaichiOperations): """A Python-side Expr wrapper, whose member variable `ptr` is an instance of C++ Expr class. A C++ Expr object contains member variable `expr` which holds an instance of C++ Expression class.""" diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index 10a671c49359e..9349b346af93a 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -27,17 +27,18 @@ from taichi.types.primitive_types import all_types, f16, f32, f64, i32, i64 - @taichi_scope def expr_init_local_tensor(shape, element_type, elements): return get_runtime().prog.current_ast_builder().expr_alloca_local_tensor( shape, element_type, elements) + @taichi_scope def expr_init_scratch_pad(shape, element_type): return get_runtime().prog.current_ast_builder().expr_alloca_scratch_pad( shape, element_type) + @taichi_scope def expr_init(rhs): if rhs is None: diff --git a/python/taichi/lang/simt/block.py b/python/taichi/lang/simt/block.py index 390376c895d64..97b698b6bf164 100644 --- a/python/taichi/lang/simt/block.py +++ b/python/taichi/lang/simt/block.py @@ -1,12 +1,14 @@ from taichi.lang import impl from taichi.lang.util import taichi_scope + def sync(): return impl.call_internal("block_barrier", with_runtime_context=False) class SharedArray: _is_taichi_class = True + def __init__(self, shape, dtype): self.shape = shape self.dtype = dtype @@ -14,4 +16,4 @@ def __init__(self, shape, dtype): @taichi_scope def _subscript(self, indices, get_ref=False): - return impl.make_index_expr(self.shared_array_proxy, (indices,)) \ No newline at end of file + return impl.make_index_expr(self.shared_array_proxy, (indices, )) diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp index d33e563987c3e..bb357a77de202 100644 --- a/taichi/codegen/llvm/codegen_llvm.cpp +++ 
b/taichi/codegen/llvm/codegen_llvm.cpp @@ -129,13 +129,15 @@ void CodeGenLLVM::visit(AllocaStmt *stmt) { auto array_size = tlctx->get_constant(tensor_type->get_num_elements()); // Return type is [array_size x type]*. if (stmt->is_shared) { - size_t data_element_size = tlctx->get_type_size(tlctx->get_data_type(tensor_type->get_element_type())); - auto type = llvm::ArrayType::get(llvm::Type::getInt8Ty(*llvm_context), - data_element_size * tensor_type->get_num_elements()); + size_t data_element_size = tlctx->get_type_size( + tlctx->get_data_type(tensor_type->get_element_type())); + auto type = llvm::ArrayType::get( + llvm::Type::getInt8Ty(*llvm_context), + data_element_size * tensor_type->get_num_elements()); auto base = new llvm::GlobalVariable( *module, type, false, llvm::GlobalValue::ExternalLinkage, nullptr, - fmt::format("shared_array_{}", stmt->id), nullptr, llvm::GlobalVariable::NotThreadLocal, - 3 /*addrspace=shared*/); + fmt::format("shared_array_{}", stmt->id), nullptr, + llvm::GlobalVariable::NotThreadLocal, 3 /*addrspace=shared*/); base->setAlignment(llvm::MaybeAlign(8)); auto ptr = builder->CreateGEP( @@ -145,7 +147,7 @@ void CodeGenLLVM::visit(AllocaStmt *stmt) { base, {tlctx->get_constant(0), tlctx->get_constant(0)}); auto ptr_type = llvm::PointerType::get( tlctx->get_data_type(tensor_type->get_element_type()), 0); - llvm_val[stmt] = builder->CreatePointerCast(ptr, ptr_type); + llvm_val[stmt] = builder->CreatePointerCast(ptr, ptr_type); } else { llvm_val[stmt] = create_entry_block_alloca(type, 0, array_size); } diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index 15d52fff10b30..6b7927617cff9 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -986,11 +986,11 @@ Expr ASTBuilder::expr_alloca_local_tensor(const std::vector &shape, } Expr ASTBuilder::expr_alloca_scratch_pad(const std::vector &shape, - const DataType &element_type) { + const DataType &element_type) { auto var = Expr(std::make_shared(get_next_id())); 
this->insert(std::make_unique( - std::static_pointer_cast(var.expr)->id, shape, - element_type, true)); + std::static_pointer_cast(var.expr)->id, shape, element_type, + true)); var->ret_type = this->get_last_stmt()->ret_type; return var; } diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index bb4fbba305408..e2a7f8e84613d 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -63,14 +63,15 @@ class FrontendAllocaStmt : public Stmt { public: Identifier ident; - FrontendAllocaStmt(const Identifier &lhs, DataType type) : ident(lhs), is_shared(false) { + FrontendAllocaStmt(const Identifier &lhs, DataType type) + : ident(lhs), is_shared(false) { ret_type = TypeFactory::create_vector_or_scalar_type(1, type); } FrontendAllocaStmt(const Identifier &lhs, std::vector shape, - DataType element, - bool is_shared=false) + DataType element, + bool is_shared = false) : ident(lhs), is_shared(is_shared) { ret_type = DataType(TypeFactory::create_tensor_type(shape, element)); } diff --git a/taichi/ir/statements.h b/taichi/ir/statements.h index f0409c6f6700c..02937e9849d1f 100644 --- a/taichi/ir/statements.h +++ b/taichi/ir/statements.h @@ -23,7 +23,7 @@ class AllocaStmt : public Stmt { TI_STMT_REG_FIELDS; } - AllocaStmt(int width, DataType type) : is_shared(false) { + AllocaStmt(int width, DataType type) : is_shared(false) { ret_type = TypeFactory::create_vector_or_scalar_type(width, type); TI_STMT_REG_FIELDS; } diff --git a/taichi/transforms/lower_ast.cpp b/taichi/transforms/lower_ast.cpp index fd339bae354e4..90e33cba4e01b 100644 --- a/taichi/transforms/lower_ast.cpp +++ b/taichi/transforms/lower_ast.cpp @@ -71,7 +71,8 @@ class LowerAST : public IRVisitor { if (stmt->ret_type->is()) { auto tensor_type = stmt->ret_type->cast(); auto lowered = std::make_unique( - tensor_type->get_shape(), tensor_type->get_element_type(), stmt->is_shared); + tensor_type->get_shape(), tensor_type->get_element_type(), + stmt->is_shared); 
block->local_var_to_stmt.insert(std::make_pair(ident, lowered.get())); stmt->parent->replace_with(stmt, std::move(lowered)); } else { From a33f7848662e27989f2328e9376d74f9f5218b08 Mon Sep 17 00:00:00 2001 From: Haidong Lan Date: Mon, 18 Jul 2022 11:08:39 +0800 Subject: [PATCH 03/11] Minor update for name consistency --- python/taichi/lang/ast/ast_transformer.py | 1 + python/taichi/lang/impl.py | 2 +- python/taichi/lang/simt/__init__.py | 1 + python/taichi/lang/simt/block.py | 3 ++- taichi/ir/frontend_ir.cpp | 4 ++-- taichi/ir/frontend_ir.h | 4 ++-- taichi/python/export_lang.cpp | 2 +- 7 files changed, 10 insertions(+), 7 deletions(-) diff --git a/python/taichi/lang/ast/ast_transformer.py b/python/taichi/lang/ast/ast_transformer.py index 76015ac9efd94..e565045cb109c 100644 --- a/python/taichi/lang/ast/ast_transformer.py +++ b/python/taichi/lang/ast/ast_transformer.py @@ -97,6 +97,7 @@ def build_Assign(ctx, node): # Ref https://github.com/taichi-dev/taichi/issues/2659. values = node.value.ptr if is_static_assign else impl.expr_init( node.value.ptr) + for node_target in node.targets: ASTTransformer.build_assign_unpack(ctx, node_target, values, is_static_assign) diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index 9349b346af93a..e032e38eaeb5a 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -35,7 +35,7 @@ def expr_init_local_tensor(shape, element_type, elements): @taichi_scope def expr_init_scratch_pad(shape, element_type): - return get_runtime().prog.current_ast_builder().expr_alloca_scratch_pad( + return get_runtime().prog.current_ast_builder().expr_alloca_shared_array( shape, element_type) diff --git a/python/taichi/lang/simt/__init__.py b/python/taichi/lang/simt/__init__.py index b7a0be5dea542..aae40ddf381eb 100644 --- a/python/taichi/lang/simt/__init__.py +++ b/python/taichi/lang/simt/__init__.py @@ -1,3 +1,4 @@ from taichi.lang.simt import block, grid, subgroup, warp +#from taichi.lang.simt.block import SharedArray 
__all__ = ['warp', 'subgroup', 'block', 'grid'] diff --git a/python/taichi/lang/simt/block.py b/python/taichi/lang/simt/block.py index 97b698b6bf164..8316e3d35bbc4 100644 --- a/python/taichi/lang/simt/block.py +++ b/python/taichi/lang/simt/block.py @@ -16,4 +16,5 @@ def __init__(self, shape, dtype): @taichi_scope def _subscript(self, indices, get_ref=False): - return impl.make_index_expr(self.shared_array_proxy, (indices, )) + return impl.make_index_expr(self.shared_array_proxy, (indices,)) + \ No newline at end of file diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index 6b7927617cff9..56d5de025a9e8 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -985,8 +985,8 @@ Expr ASTBuilder::expr_alloca_local_tensor(const std::vector &shape, return var; } -Expr ASTBuilder::expr_alloca_scratch_pad(const std::vector &shape, - const DataType &element_type) { +Expr ASTBuilder::expr_alloca_shared_array(const std::vector &shape, + const DataType &element_type) { auto var = Expr(std::make_shared(get_next_id())); this->insert(std::make_unique( std::static_pointer_cast(var.expr)->id, shape, element_type, diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index e2a7f8e84613d..0f831a2fa158e 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -878,7 +878,7 @@ class ASTBuilder { Expr expr_alloca_local_tensor(const std::vector &shape, const DataType &element_type, const ExprGroup &elements); - Expr expr_alloca_scratch_pad(const std::vector &shape, + Expr expr_alloca_shared_array(const std::vector &shape, const DataType &element_type); void expr_assign(const Expr &lhs, const Expr &rhs, std::string tb); void create_assert_stmt(const Expr &cond, @@ -913,7 +913,7 @@ class ASTBuilder { } void block_dim(int v) { - TI_ASSERT(bit::is_power_of_two(v)); + //TI_ASSERT(bit::is_power_of_two(v)); for_loop_dec_.config.block_dim = v; } diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 
609ebd36c931e..edf26ec7078c5 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -303,7 +303,7 @@ void export_lang(py::module &m) { .def("insert_external_func_call", &ASTBuilder::insert_external_func_call) .def("expr_alloca", &ASTBuilder::expr_alloca) .def("expr_alloca_local_tensor", &ASTBuilder::expr_alloca_local_tensor) - .def("expr_alloca_scratch_pad", &ASTBuilder::expr_alloca_scratch_pad) + .def("expr_alloca_shared_array", &ASTBuilder::expr_alloca_shared_array) .def("create_assert_stmt", &ASTBuilder::create_assert_stmt) .def("expr_assign", &ASTBuilder::expr_assign) .def("begin_frontend_range_for", &ASTBuilder::begin_frontend_range_for) From f5117e229e1f511dfb18db14a23fae8b0cae8c26 Mon Sep 17 00:00:00 2001 From: Haidong Lan Date: Mon, 18 Jul 2022 11:09:52 +0800 Subject: [PATCH 04/11] format --- python/taichi/lang/simt/__init__.py | 1 + python/taichi/lang/simt/block.py | 3 +-- taichi/ir/frontend_ir.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/taichi/lang/simt/__init__.py b/python/taichi/lang/simt/__init__.py index aae40ddf381eb..a8c121c1e3d90 100644 --- a/python/taichi/lang/simt/__init__.py +++ b/python/taichi/lang/simt/__init__.py @@ -1,4 +1,5 @@ from taichi.lang.simt import block, grid, subgroup, warp + #from taichi.lang.simt.block import SharedArray __all__ = ['warp', 'subgroup', 'block', 'grid'] diff --git a/python/taichi/lang/simt/block.py b/python/taichi/lang/simt/block.py index 8316e3d35bbc4..97b698b6bf164 100644 --- a/python/taichi/lang/simt/block.py +++ b/python/taichi/lang/simt/block.py @@ -16,5 +16,4 @@ def __init__(self, shape, dtype): @taichi_scope def _subscript(self, indices, get_ref=False): - return impl.make_index_expr(self.shared_array_proxy, (indices,)) - \ No newline at end of file + return impl.make_index_expr(self.shared_array_proxy, (indices, )) diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index 0f831a2fa158e..4bbdb26f5219f 100644 --- 
a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -879,7 +879,7 @@ class ASTBuilder { const DataType &element_type, const ExprGroup &elements); Expr expr_alloca_shared_array(const std::vector &shape, - const DataType &element_type); + const DataType &element_type); void expr_assign(const Expr &lhs, const Expr &rhs, std::string tb); void create_assert_stmt(const Expr &cond, const std::string &msg, @@ -913,7 +913,7 @@ class ASTBuilder { } void block_dim(int v) { - //TI_ASSERT(bit::is_power_of_two(v)); + // TI_ASSERT(bit::is_power_of_two(v)); for_loop_dec_.config.block_dim = v; } From 3ce855af84709505d9d6a7233d52a01c9971db77 Mon Sep 17 00:00:00 2001 From: Haidong Lan Date: Mon, 18 Jul 2022 11:39:24 +0800 Subject: [PATCH 05/11] Minor patches --- python/taichi/lang/impl.py | 2 +- python/taichi/lang/simt/__init__.py | 2 -- python/taichi/lang/simt/block.py | 2 +- taichi/ir/frontend_ir.h | 6 +++++- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index e032e38eaeb5a..4168454898c43 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -34,7 +34,7 @@ def expr_init_local_tensor(shape, element_type, elements): @taichi_scope -def expr_init_scratch_pad(shape, element_type): +def expr_init_shared_array(shape, element_type): return get_runtime().prog.current_ast_builder().expr_alloca_shared_array( shape, element_type) diff --git a/python/taichi/lang/simt/__init__.py b/python/taichi/lang/simt/__init__.py index a8c121c1e3d90..b7a0be5dea542 100644 --- a/python/taichi/lang/simt/__init__.py +++ b/python/taichi/lang/simt/__init__.py @@ -1,5 +1,3 @@ from taichi.lang.simt import block, grid, subgroup, warp -#from taichi.lang.simt.block import SharedArray - __all__ = ['warp', 'subgroup', 'block', 'grid'] diff --git a/python/taichi/lang/simt/block.py b/python/taichi/lang/simt/block.py index 97b698b6bf164..1d51a1a784dd3 100644 --- a/python/taichi/lang/simt/block.py +++ 
b/python/taichi/lang/simt/block.py @@ -12,7 +12,7 @@ class SharedArray: def __init__(self, shape, dtype): self.shape = shape self.dtype = dtype - self.shared_array_proxy = impl.expr_init_scratch_pad(shape, dtype) + self.shared_array_proxy = impl.expr_init_shared_array(shape, dtype) @taichi_scope def _subscript(self, indices, get_ref=False): diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index 4bbdb26f5219f..1e70069dd7b9f 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -913,7 +913,11 @@ class ASTBuilder { } void block_dim(int v) { - // TI_ASSERT(bit::is_power_of_two(v)); + if (arch_ == Arch::cuda) { + TI_ASSERT(v % 32 == 0); + } else { + TI_ASSERT(bit::is_power_of_two(v)); + } for_loop_dec_.config.block_dim = v; } From b7382cf194923c73408bdddf6153a3d14ca2f427 Mon Sep 17 00:00:00 2001 From: Haidong Lan Date: Mon, 18 Jul 2022 15:14:41 +0800 Subject: [PATCH 06/11] Add a simple test --- tests/python/test_shared_array.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tests/python/test_shared_array.py diff --git a/tests/python/test_shared_array.py b/tests/python/test_shared_array.py new file mode 100644 index 0000000000000..3f87072e0444f --- /dev/null +++ b/tests/python/test_shared_array.py @@ -0,0 +1,26 @@ +import taichi as ti +from tests import test_utils + + +@test_utils.test(arch=ti.cuda) +def test_shared_array_save(): + block_dim=128 + pad_num = 16 + a = ti.field(dtype=ti.f32, shape=(block_dim * pad_num,)) + @ti.kernel + def func(): + ti.loop_config(block_dim=block_dim) + for i in range(block_dim * pad_num): + g_tid = ti.global_thread_idx() + tid = g_tid % block_dim + pad = ti.simt.block.SharedArray((block_dim,), ti.f32) + pad[tid] = tid * 2.0 + ti.simt.block.sync() + a[i] = pad[tid] + ti.simt.block.sync() + func() + print(a) + for i in range(pad_num): + assert a[i * block_dim + 7] == 14.0 + assert a[i * block_dim + 29] == 58.0 + assert a[i * block_dim + 127] == 254.0 From 
bb029026794b9384c392eb5a2f366143e2e159da Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 Jul 2022 07:16:47 +0000 Subject: [PATCH 07/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/python/test_shared_array.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/python/test_shared_array.py b/tests/python/test_shared_array.py index 3f87072e0444f..47c8613b83e19 100644 --- a/tests/python/test_shared_array.py +++ b/tests/python/test_shared_array.py @@ -4,20 +4,22 @@ @test_utils.test(arch=ti.cuda) def test_shared_array_save(): - block_dim=128 + block_dim = 128 pad_num = 16 - a = ti.field(dtype=ti.f32, shape=(block_dim * pad_num,)) + a = ti.field(dtype=ti.f32, shape=(block_dim * pad_num, )) + @ti.kernel def func(): ti.loop_config(block_dim=block_dim) for i in range(block_dim * pad_num): g_tid = ti.global_thread_idx() tid = g_tid % block_dim - pad = ti.simt.block.SharedArray((block_dim,), ti.f32) + pad = ti.simt.block.SharedArray((block_dim, ), ti.f32) pad[tid] = tid * 2.0 ti.simt.block.sync() a[i] = pad[tid] ti.simt.block.sync() + func() print(a) for i in range(pad_num): From dfffafe77ed737b71eec57b1f72b471dbe0eba98 Mon Sep 17 00:00:00 2001 From: Haidong Lan Date: Mon, 18 Jul 2022 15:42:02 +0800 Subject: [PATCH 08/11] minor fix --- tests/python/test_shared_array.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python/test_shared_array.py b/tests/python/test_shared_array.py index 3f87072e0444f..f511d92992d64 100644 --- a/tests/python/test_shared_array.py +++ b/tests/python/test_shared_array.py @@ -19,7 +19,6 @@ def func(): a[i] = pad[tid] ti.simt.block.sync() func() - print(a) for i in range(pad_num): assert a[i * block_dim + 7] == 14.0 assert a[i * block_dim + 29] == 58.0 From 95a41dd9b8c666e5481199cf5e66577f5cc108b5 Mon Sep 17 00:00:00 2001 From: Haidong Lan Date: Tue, 19 Jul 2022 16:22:45 +0800 
Subject: [PATCH 09/11] Re-enable CUDA launch parameter for small block dim --- taichi/ir/frontend_ir.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index 1e70069dd7b9f..9859f6a9cb63d 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -914,7 +914,7 @@ class ASTBuilder { void block_dim(int v) { if (arch_ == Arch::cuda) { - TI_ASSERT(v % 32 == 0); + TI_ASSERT((v % 32 == 0) || bit::is_power_of_two(v)); } else { TI_ASSERT(bit::is_power_of_two(v)); } From c1bbaee138b33c85f04c9ddbb4f7baa8b6bb104e Mon Sep 17 00:00:00 2001 From: Haidong Lan Date: Wed, 20 Jul 2022 18:11:59 +0800 Subject: [PATCH 10/11] Minor fixes: shared memory and indices reference --- python/taichi/lang/simt/block.py | 2 +- taichi/transforms/ir_printer.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/taichi/lang/simt/block.py b/python/taichi/lang/simt/block.py index 1d51a1a784dd3..0baf3f352b5de 100644 --- a/python/taichi/lang/simt/block.py +++ b/python/taichi/lang/simt/block.py @@ -15,5 +15,5 @@ def __init__(self, shape, dtype): self.shared_array_proxy = impl.expr_init_shared_array(shape, dtype) @taichi_scope - def _subscript(self, indices, get_ref=False): + def _subscript(self, *indices, get_ref=False): return impl.make_index_expr(self.shared_array_proxy, (indices, )) diff --git a/taichi/transforms/ir_printer.cpp b/taichi/transforms/ir_printer.cpp index 76eb3f28153b9..b3041bb2acf34 100644 --- a/taichi/transforms/ir_printer.cpp +++ b/taichi/transforms/ir_printer.cpp @@ -119,9 +119,9 @@ class IRPrinter : public IRVisitor { } void visit(FrontendAllocaStmt *alloca) override { - std::string shared_suffix = (alloca->is_shared) ? " shared" : ""; - print("{}${} = alloca {}{}", alloca->type_hint(), alloca->id, - alloca->ident.name(), shared_suffix); + std::string shared_suffix = (alloca->is_shared) ? 
"(shared)" : ""; + print("{}${} = alloca{} {}", alloca->type_hint(), alloca->id, + shared_suffix, alloca->ident.name()); } void visit(FrontendAssertStmt *assert) override { @@ -188,7 +188,7 @@ class IRPrinter : public IRVisitor { } void visit(AllocaStmt *alloca) override { - std::string shared_suffix = (alloca->is_shared) ? " shared" : ""; + std::string shared_suffix = (alloca->is_shared) ? "(shared)" : ""; print("{}${} = alloca{}", alloca->type_hint(), alloca->id, shared_suffix); } From eb862e3c83edf069d757046d5865f053ede9839b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 20 Jul 2022 10:13:32 +0000 Subject: [PATCH 11/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- taichi/transforms/ir_printer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/taichi/transforms/ir_printer.cpp b/taichi/transforms/ir_printer.cpp index b3041bb2acf34..ca462e42773e8 100644 --- a/taichi/transforms/ir_printer.cpp +++ b/taichi/transforms/ir_printer.cpp @@ -120,8 +120,8 @@ class IRPrinter : public IRVisitor { void visit(FrontendAllocaStmt *alloca) override { std::string shared_suffix = (alloca->is_shared) ? "(shared)" : ""; - print("{}${} = alloca{} {}", alloca->type_hint(), alloca->id, - shared_suffix, alloca->ident.name()); + print("{}${} = alloca{} {}", alloca->type_hint(), alloca->id, shared_suffix, + alloca->ident.name()); } void visit(FrontendAssertStmt *assert) override {