diff --git a/benchmarks/microbenchmarks/__init__.py b/benchmarks/microbenchmarks/__init__.py
new file mode 100644
index 0000000000000..c76b59c8b0385
--- /dev/null
+++ b/benchmarks/microbenchmarks/__init__.py
@@ -0,0 +1,3 @@
+from .fill import FillPlan
+
+benchmark_plan_list = [FillPlan]
diff --git a/benchmarks/microbenchmarks/_items.py b/benchmarks/microbenchmarks/_items.py
new file mode 100644
index 0000000000000..78af0ebd27eeb
--- /dev/null
+++ b/benchmarks/microbenchmarks/_items.py
@@ -0,0 +1,41 @@
+from microbenchmarks._utils import size2tag
+
+import taichi as ti
+
+
+class BenchmarkItem:
+    name = 'item'
+
+    def __init__(self):
+        self._items = {}  # {'tag': impl, ...}
+
+    def get(self):
+        return self._items
+
+    def get_tags(self):
+        return list(self._items.keys())
+
+    def impl(self, tag: str):
+        return self._items[tag]
+
+
+class DataType(BenchmarkItem):
+    name = 'dtype'
+
+    def __init__(self):
+        self._items = {
+            str(ti.i32): ti.i32,
+            str(ti.i64): ti.i64,
+            str(ti.f32): ti.f32,
+            str(ti.f64): ti.f64
+        }
+
+
+class DataSize(BenchmarkItem):
+    name = 'dsize'
+
+    def __init__(self):
+        self._items = {}
+        for i in range(1, 10):  # [4KB,16KB...256MB]
+            size_bytes = (4**i) * 1024  # kibibytes(KiB) = 1024
+            self._items[size2tag(size_bytes)] = size_bytes
diff --git a/benchmarks/microbenchmarks/_plan.py b/benchmarks/microbenchmarks/_plan.py
new file mode 100644
index 0000000000000..633a2cb6ee580
--- /dev/null
+++ b/benchmarks/microbenchmarks/_plan.py
@@ -0,0 +1,58 @@
+import itertools
+
+from microbenchmarks._result import ResultType
+from microbenchmarks._utils import get_ti_arch, tags2name
+
+import taichi as ti
+
+
+class BenchmarkPlan:
+    def __init__(self, name='plan', arch='x64', basic_repeat_times=1):
+        self.name = name
+        self.arch = arch
+        self.basic_repeat_times = basic_repeat_times
+        self.info = {'name': self.name}
+        self.plan = {}  # {'tags': [...], 'result': None}
+        self.items = []
+        self.func = None
+
+    def create_plan(self, *items):
+        self.items = list(items)
+        items_list = [[self.name]]
+        for item in self.items:
+            items_list.append(item.get_tags())
+            self.info[item.name] = item.get_tags()
+        case_list = list(itertools.product(*items_list))  #items generate cases
+        for tags in case_list:
+            self.plan[tags2name(tags)] = {'tags': tags, 'result': None}
+
+    def set_func(self, func):
+        self.func = func
+
+    def run(self):
+        for case, plan in self.plan.items():
+            tag_list = plan['tags']
+            self._init_taichi(self.arch, tag_list)
+            _ms = self._run_func(tag_list)
+            plan['result'] = _ms
+            print(f'{tag_list}={_ms}')
+            ti.reset()
+        rdict = {'results': self.plan, 'info': self.info}
+        return rdict
+
+    def _get_kwargs(self, tags):
+        kwargs = {}
+        tags = tags[1:]  # tags = [case_name, item1_tag, item2_tag, ...]
+        for item, tag in zip(self.items, tags):
+            kwargs[item.name] = item.impl(tag)
+        return kwargs
+
+    def _init_taichi(self, arch, tags):
+        for tag in tags:
+            if ResultType.init_taichi(arch, tag):
+                return True
+        return False
+
+    def _run_func(self, tags: list):
+        return self.func(self.arch, self.basic_repeat_times,
+                         **self._get_kwargs(tags))
diff --git a/benchmarks/microbenchmarks/_result.py b/benchmarks/microbenchmarks/_result.py
new file mode 100644
index 0000000000000..50a34d1aed4fa
--- /dev/null
+++ b/benchmarks/microbenchmarks/_result.py
@@ -0,0 +1,28 @@
+from microbenchmarks._items import BenchmarkItem
+from microbenchmarks._utils import get_ti_arch
+
+import taichi as ti
+
+
+def kernel_executor(repeat, func, *args):
+    # compile & warmup
+    for i in range(repeat):
+        func(*args)
+    ti.clear_kernel_profile_info()
+    for i in range(repeat):
+        func(*args)
+    return ti.kernel_profiler_total_time() * 1000 / repeat  #ms
+
+
+class ResultType(BenchmarkItem):
+    name = 'get_result'
+
+    def __init__(self):
+        self._items = {'kernel_elapsed_time_ms': kernel_executor}
+
+    @staticmethod
+    def init_taichi(arch: str, result_tag: str):
+        if result_tag == 'kernel_elapsed_time_ms':
+            ti.init(kernel_profiler=True, arch=get_ti_arch(arch))
+            return True
+        return False
diff --git a/benchmarks/microbenchmarks/_utils.py b/benchmarks/microbenchmarks/_utils.py
new file mode 100644
index 0000000000000..b8bd74489bceb
--- /dev/null
+++ b/benchmarks/microbenchmarks/_utils.py
@@ -0,0 +1,44 @@
+from taichi._lib import core as ti_core
+
+import taichi as ti
+
+
+def size2tag(size_in_byte):
+    size_subsection = [(1.0, 'B'), (1024.0, 'KB'), (1048576.0, 'MB'),
+                       (1073741824.0, 'GB'),
+                       (float('inf'), 'INF')]  #B KB MB GB
+    for dsize, unit in reversed(size_subsection):
+        if size_in_byte >= dsize:
+            return str(int(size_in_byte / dsize)) + unit
+
+
+def tags2name(tag_list):
+    return '_'.join(tag_list)
+
+
+def dtype_size(ti_dtype):
+    dtype_size_dict = {ti.i32: 4, ti.i64: 8, ti.f32: 4, ti.f64: 8}
+    if ti_dtype not in dtype_size_dict:
+        raise RuntimeError('Unsupported ti.dtype: ' + str(type(ti_dtype)))
+    else:
+        return dtype_size_dict[ti_dtype]
+
+
+def get_ti_arch(arch: str):
+    arch_dict = {
+        'cuda': ti.cuda,
+        'vulkan': ti.vulkan,
+        'opengl': ti.opengl,
+        'metal': ti.metal,
+        'x64': ti.x64,
+        'cc': ti.cc
+    }
+    return arch_dict[arch]
+
+
+def scaled_repeat_times(arch: str, datasize, repeat=1):
+    if arch in ('cuda', 'vulkan', 'opengl'):
+        repeat *= 10
+    if datasize <= 4 * 1024 * 1024:
+        repeat *= 10
+    return repeat
diff --git a/benchmarks/microbenchmarks/fill.py b/benchmarks/microbenchmarks/fill.py
new file mode 100644
index 0000000000000..b7533dc631a02
--- /dev/null
+++ b/benchmarks/microbenchmarks/fill.py
@@ -0,0 +1,25 @@
+from microbenchmarks._items import DataSize, DataType
+from microbenchmarks._plan import BenchmarkPlan
+from microbenchmarks._result import ResultType
+from microbenchmarks._utils import dtype_size, scaled_repeat_times
+
+import taichi as ti
+
+
+def fill_default(arch, repeat, dtype, dsize, get_result):
+    @ti.kernel
+    def fill_field(dst: ti.template()):
+        for I in ti.grouped(dst):
+            dst[I] = ti.cast(0.7, dtype)
+
+    repeat = scaled_repeat_times(arch, dsize, repeat)
+    num_elements = dsize // dtype_size(dtype)
+    x = ti.field(dtype, num_elements)
+    return get_result(repeat, fill_field, x)
+
+
+class FillPlan(BenchmarkPlan):
+    def __init__(self, arch: str):
+        super().__init__('fill', arch, basic_repeat_times=10)
+        self.create_plan(DataType(), DataSize(), ResultType())
+        self.set_func(fill_default)