diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py
index 43940fce55439..4168454898c43 100644
--- a/python/taichi/lang/impl.py
+++ b/python/taichi/lang/impl.py
@@ -19,6 +19,7 @@
                               MeshRelationAccessProxy,
                               MeshReorderedMatrixFieldProxy,
                               MeshReorderedScalarFieldProxy, element_type_name)
+from taichi.lang.simt.block import SharedArray
 from taichi.lang.snode import SNode
 from taichi.lang.struct import Struct, StructField, _IntermediateStruct
 from taichi.lang.util import (cook_dtype, get_traceback, is_taichi_class,
@@ -32,6 +33,12 @@ def expr_init_local_tensor(shape, element_type, elements):
         shape, element_type, elements)
 
 
+@taichi_scope
+def expr_init_shared_array(shape, element_type):
+    return get_runtime().prog.current_ast_builder().expr_alloca_shared_array(
+        shape, element_type)
+
+
 @taichi_scope
 def expr_init(rhs):
     if rhs is None:
@@ -40,6 +47,8 @@ def expr_init(rhs):
         return type(rhs)(*rhs.to_list())
     if isinstance(rhs, Matrix):
         return Matrix(rhs.to_list())
+    if isinstance(rhs, SharedArray):
+        return rhs
     if isinstance(rhs, Struct):
         return Struct(rhs.to_dict(include_methods=True))
     if isinstance(rhs, list):
diff --git a/python/taichi/lang/simt/block.py b/python/taichi/lang/simt/block.py
index 427664473dd5d..0baf3f352b5de 100644
--- a/python/taichi/lang/simt/block.py
+++ b/python/taichi/lang/simt/block.py
@@ -1,5 +1,19 @@
 from taichi.lang import impl
+from taichi.lang.util import taichi_scope
 
 
 def sync():
     return impl.call_internal("block_barrier", with_runtime_context=False)
+
+
+class SharedArray:
+    _is_taichi_class = True
+
+    def __init__(self, shape, dtype):
+        self.shape = shape
+        self.dtype = dtype
+        self.shared_array_proxy = impl.expr_init_shared_array(shape, dtype)
+
+    @taichi_scope
+    def _subscript(self, *indices, get_ref=False):
+        return impl.make_index_expr(self.shared_array_proxy, (indices, ))
diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp
index 6966c81f44d84..bb357a77de202 100644
--- a/taichi/codegen/llvm/codegen_llvm.cpp
+++ b/taichi/codegen/llvm/codegen_llvm.cpp
@@ -128,7 +128,29 @@ void CodeGenLLVM::visit(AllocaStmt *stmt) {
     auto type = tlctx->get_data_type(tensor_type->get_element_type());
     auto array_size = tlctx->get_constant(tensor_type->get_num_elements());
     // Return type is [array_size x type]*.
-    llvm_val[stmt] = create_entry_block_alloca(type, 0, array_size);
+    if (stmt->is_shared) {
+      size_t data_element_size = tlctx->get_type_size(
+          tlctx->get_data_type(tensor_type->get_element_type()));
+      auto type = llvm::ArrayType::get(
+          llvm::Type::getInt8Ty(*llvm_context),
+          data_element_size * tensor_type->get_num_elements());
+      auto base = new llvm::GlobalVariable(
+          *module, type, false, llvm::GlobalValue::ExternalLinkage, nullptr,
+          fmt::format("shared_array_{}", stmt->id), nullptr,
+          llvm::GlobalVariable::NotThreadLocal, 3 /*addrspace=shared*/);
+      base->setAlignment(llvm::MaybeAlign(8));
+
+      auto ptr = builder->CreateGEP(
+#ifdef TI_LLVM_15
+          base->getValueType(),
+#endif
+          base, {tlctx->get_constant(0), tlctx->get_constant(0)});
+      auto ptr_type = llvm::PointerType::get(
+          tlctx->get_data_type(tensor_type->get_element_type()), 0);
+      llvm_val[stmt] = builder->CreatePointerCast(ptr, ptr_type);
+    } else {
+      llvm_val[stmt] = create_entry_block_alloca(type, 0, array_size);
+    }
   } else {
     TI_ASSERT(stmt->width() == 1);
     llvm_val[stmt] =
diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp
index f2c03ea18c99d..56d5de025a9e8 100644
--- a/taichi/ir/frontend_ir.cpp
+++ b/taichi/ir/frontend_ir.cpp
@@ -985,6 +985,16 @@ Expr ASTBuilder::expr_alloca_local_tensor(const std::vector<int> &shape,
   return var;
 }
 
+Expr ASTBuilder::expr_alloca_shared_array(const std::vector<int> &shape,
+                                          const DataType &element_type) {
+  auto var = Expr(std::make_shared<IdExpression>(get_next_id()));
+  this->insert(std::make_unique<FrontendAllocaStmt>(
+      std::static_pointer_cast<IdExpression>(var.expr)->id, shape, element_type,
+      true));
+  var->ret_type = this->get_last_stmt()->ret_type;
+  return var;
+}
+
 void ASTBuilder::expr_assign(const Expr &lhs, const Expr &rhs, std::string tb) {
   TI_ASSERT(lhs->is_lvalue());
   auto stmt = std::make_unique<AssignStmt>(lhs, rhs);
diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h
index fdbe86c95c620..9859f6a9cb63d 100644
--- a/taichi/ir/frontend_ir.h
+++ b/taichi/ir/frontend_ir.h
@@ -63,17 +63,21 @@ class FrontendAllocaStmt : public Stmt {
  public:
   Identifier ident;
 
-  FrontendAllocaStmt(const Identifier &lhs, DataType type) : ident(lhs) {
+  FrontendAllocaStmt(const Identifier &lhs, DataType type)
+      : ident(lhs), is_shared(false) {
     ret_type = TypeFactory::create_vector_or_scalar_type(1, type);
   }
 
   FrontendAllocaStmt(const Identifier &lhs,
                      std::vector<int> shape,
-                     DataType element)
-      : ident(lhs) {
+                     DataType element,
+                     bool is_shared = false)
+      : ident(lhs), is_shared(is_shared) {
     ret_type = DataType(TypeFactory::create_tensor_type(shape, element));
   }
 
+  bool is_shared;
+
   TI_DEFINE_ACCEPT
 };
@@ -874,6 +878,8 @@ class ASTBuilder {
   Expr expr_alloca_local_tensor(const std::vector<int> &shape,
                                 const DataType &element_type,
                                 const ExprGroup &elements);
+  Expr expr_alloca_shared_array(const std::vector<int> &shape,
+                                const DataType &element_type);
   void expr_assign(const Expr &lhs, const Expr &rhs, std::string tb);
   void create_assert_stmt(const Expr &cond, const std::string &msg,
@@ -907,7 +913,11 @@ class ASTBuilder {
   }
 
   void block_dim(int v) {
-    TI_ASSERT(bit::is_power_of_two(v));
+    if (arch_ == Arch::cuda) {
+      TI_ASSERT((v % 32 == 0) || bit::is_power_of_two(v));
+    } else {
+      TI_ASSERT(bit::is_power_of_two(v));
+    }
     for_loop_dec_.config.block_dim = v;
   }
diff --git a/taichi/ir/statements.h b/taichi/ir/statements.h
index 0ffa885f1fb27..02937e9849d1f 100644
--- a/taichi/ir/statements.h
+++ b/taichi/ir/statements.h
@@ -18,17 +18,20 @@ class Function;
  */
 class AllocaStmt : public Stmt {
  public:
-  AllocaStmt(DataType type) {
+  AllocaStmt(DataType type) : is_shared(false) {
    ret_type = TypeFactory::create_vector_or_scalar_type(1, type);
     TI_STMT_REG_FIELDS;
   }
 
-  AllocaStmt(int width, DataType type) {
+  AllocaStmt(int width, DataType type) : is_shared(false) {
     ret_type = TypeFactory::create_vector_or_scalar_type(width, type);
     TI_STMT_REG_FIELDS;
   }
 
-  AllocaStmt(const std::vector<int> &shape, DataType type) {
+  AllocaStmt(const std::vector<int> &shape,
+             DataType type,
+             bool is_shared = false)
+      : is_shared(is_shared) {
     ret_type = TypeFactory::create_tensor_type(shape, type);
     TI_STMT_REG_FIELDS;
   }
@@ -41,7 +44,8 @@ class AllocaStmt : public Stmt {
     return false;
   }
 
-  TI_STMT_DEF_FIELDS(ret_type);
+  bool is_shared;
+  TI_STMT_DEF_FIELDS(ret_type, is_shared);
   TI_DEFINE_ACCEPT_AND_CLONE
 };
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index a2ea36d1ecfc2..edf26ec7078c5 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -303,6 +303,7 @@ void export_lang(py::module &m) {
       .def("insert_external_func_call", &ASTBuilder::insert_external_func_call)
       .def("expr_alloca", &ASTBuilder::expr_alloca)
       .def("expr_alloca_local_tensor", &ASTBuilder::expr_alloca_local_tensor)
+      .def("expr_alloca_shared_array", &ASTBuilder::expr_alloca_shared_array)
       .def("create_assert_stmt", &ASTBuilder::create_assert_stmt)
       .def("expr_assign", &ASTBuilder::expr_assign)
       .def("begin_frontend_range_for", &ASTBuilder::begin_frontend_range_for)
diff --git a/taichi/transforms/ir_printer.cpp b/taichi/transforms/ir_printer.cpp
index eb94695e83125..ca462e42773e8 100644
--- a/taichi/transforms/ir_printer.cpp
+++ b/taichi/transforms/ir_printer.cpp
@@ -119,7 +119,8 @@ class IRPrinter : public IRVisitor {
   }
 
   void visit(FrontendAllocaStmt *alloca) override {
-    print("{}${} = alloca {}", alloca->type_hint(), alloca->id,
+    std::string shared_suffix = (alloca->is_shared) ? "(shared)" : "";
+    print("{}${} = alloca{} {}", alloca->type_hint(), alloca->id, shared_suffix,
           alloca->ident.name());
   }
 
@@ -187,7 +188,8 @@ class IRPrinter : public IRVisitor {
   }
 
   void visit(AllocaStmt *alloca) override {
-    print("{}${} = alloca", alloca->type_hint(), alloca->id);
+    std::string shared_suffix = (alloca->is_shared) ? "(shared)" : "";
"(shared)" : ""; + print("{}${} = alloca{}", alloca->type_hint(), alloca->id, shared_suffix); } void visit(RandStmt *stmt) override { diff --git a/taichi/transforms/lower_ast.cpp b/taichi/transforms/lower_ast.cpp index e7c390ce3c41e..90e33cba4e01b 100644 --- a/taichi/transforms/lower_ast.cpp +++ b/taichi/transforms/lower_ast.cpp @@ -71,7 +71,8 @@ class LowerAST : public IRVisitor { if (stmt->ret_type->is()) { auto tensor_type = stmt->ret_type->cast(); auto lowered = std::make_unique( - tensor_type->get_shape(), tensor_type->get_element_type()); + tensor_type->get_shape(), tensor_type->get_element_type(), + stmt->is_shared); block->local_var_to_stmt.insert(std::make_pair(ident, lowered.get())); stmt->parent->replace_with(stmt, std::move(lowered)); } else { diff --git a/tests/python/test_shared_array.py b/tests/python/test_shared_array.py new file mode 100644 index 0000000000000..64aa418807bea --- /dev/null +++ b/tests/python/test_shared_array.py @@ -0,0 +1,27 @@ +import taichi as ti +from tests import test_utils + + +@test_utils.test(arch=ti.cuda) +def test_shared_array_save(): + block_dim = 128 + pad_num = 16 + a = ti.field(dtype=ti.f32, shape=(block_dim * pad_num, )) + + @ti.kernel + def func(): + ti.loop_config(block_dim=block_dim) + for i in range(block_dim * pad_num): + g_tid = ti.global_thread_idx() + tid = g_tid % block_dim + pad = ti.simt.block.SharedArray((block_dim, ), ti.f32) + pad[tid] = tid * 2.0 + ti.simt.block.sync() + a[i] = pad[tid] + ti.simt.block.sync() + + func() + for i in range(pad_num): + assert a[i * block_dim + 7] == 14.0 + assert a[i * block_dim + 29] == 58.0 + assert a[i * block_dim + 127] == 254.0